{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# install kubeflow-training extra 'huggingface'\n",
    "!pip install -U 'kubeflow-training[huggingface]'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import the libraries\n",
    "from kubeflow.training.api.training_client import TrainingClient\n",
    "from kubeflow.storage_initializer.hugging_face import (\n",
    "    HuggingFaceModelParams,\n",
    "    HuggingFaceTrainerParams,\n",
    ")\n",
    "from kubeflow.storage_initializer.s3 import S3DatasetParams\n",
    "from kubeflow.storage_initializer.constants import INIT_CONTAINER_MOUNT_PATH\n",
    "from peft import LoraConfig\n",
    "import transformers\n",
    "from transformers import TrainingArguments\n",
    "from kubeflow.training import constants"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create a training client, pass config_file parameter if you want to use kubeconfig other than \"~/.kube/config\"\n",
    "client = TrainingClient()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "USING S3 AS THE DATASET SOURCE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Need to set S3 credentials\n",
    "s3_access_key = \"\"\n",
    "s3_secret_key = \"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# mention the model, datasets and training parameters\n",
    "client.train(\n",
    "    name=\"s3-test\",\n",
    "    num_workers=2,\n",
    "    num_procs_per_worker=1,\n",
    "    # specify the storage class if you don't want to use the default one for the storage-initializer PVC\n",
    "    # storage_config={\n",
    "    #     \"size\": \"10Gi\",\n",
    "    #     \"storage_class\": \"<your storage class>\",\n",
    "    # },\n",
    "    model_provider_parameters=HuggingFaceModelParams(\n",
    "        model_uri=\"hf://TinyLlama/TinyLlama-1.1B-Chat-v1.0\",\n",
    "        transformer_type=transformers.AutoModelForCausalLM,\n",
    "    ),\n",
    "    # it is assumed for text related tasks, you have 'text' column in the dataset.\n",
    "    # for more info on how dataset is loaded check load_and_preprocess_data function in sdk/python/kubeflow/trainer/hf_llm_training.py\n",
    "    dataset_provider_parameters=S3DatasetParams(\n",
    "        endpoint_url=\"http://10.117.63.3\",\n",
    "        bucket_name=\"test\",\n",
    "        file_key=\"imdatta0___ultrachat_1k\",\n",
    "        region_name=\"us-east-1\",\n",
    "        access_key=s3_access_key,\n",
    "        secret_key=s3_secret_key,\n",
    "    ),\n",
    "    trainer_parameters=HuggingFaceTrainerParams(\n",
    "        lora_config=LoraConfig(\n",
    "            r=8,\n",
    "            lora_alpha=8,\n",
    "            lora_dropout=0.2,\n",
    "            bias=\"none\",\n",
    "            task_type=\"CAUSAL_LM\",\n",
    "        ),\n",
    "        training_parameters=TrainingArguments(\n",
    "            num_train_epochs=1,\n",
    "            per_device_train_batch_size=1,\n",
    "            gradient_accumulation_steps=1,\n",
    "            gradient_checkpointing=True,\n",
    "            gradient_checkpointing_kwargs={\n",
    "                \"use_reentrant\": False\n",
    "            },  # this is mandatory if checkpointng is enabled\n",
    "            warmup_steps=0.02,\n",
    "            learning_rate=1,\n",
    "            lr_scheduler_type=\"cosine\",\n",
    "            bf16=False,\n",
    "            logging_steps=0.01,\n",
    "            output_dir=INIT_CONTAINER_MOUNT_PATH,\n",
    "            optim=f\"sgd\",\n",
    "            save_steps=0.01,\n",
    "            save_total_limit=3,\n",
    "            disable_tqdm=False,\n",
    "            resume_from_checkpoint=True,\n",
    "            remove_unused_columns=True,\n",
    "            ddp_backend=\"nccl\",  # change the backend to gloo if you want cpu based training and remove the gpu key in resources_per_worker\n",
    "        ),\n",
    "    ),\n",
    "    resources_per_worker={\n",
    "        \"gpu\": 1,\n",
    "        \"cpu\": 8,\n",
    "        \"memory\": \"8Gi\",\n",
    "    },  # remove the gpu key if you don't want to attach gpus to the pods\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# check the logs of the job\n",
    "client.get_job_logs(name=\"s3-test\", job_kind=constants.PYTORCHJOB_KIND)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "myenv3.11",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
